# COMPSCI 389: Introduction to Machine Learning
Data Cleaning Introduction

The code below runs gradient descent to minimize the sample mean squared error when using a linear parametric model, with the second-degree (order) polynomial basis. The first code block defines the various functions for this. I recommend skipping down to the next markdown block.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Function to calculate mean squared error (for evaluation)
def mean_squared_error(predictions, labels):
    """Return the mean of the squared element-wise differences.

    Args:
        predictions: Model outputs (array-like supporting ``-`` and ``**``).
        labels: Target values, element-wise compatible with ``predictions``.

    Returns:
        The average of ``(predictions - labels) ** 2``.
    """
    residuals = predictions - labels
    return np.mean(residuals ** 2)

# Function to calculate gradients
def compute_gradients(X, y, weights):
    """Return the gradient of the mean squared error with respect to ``weights``.

    Args:
        X: Feature matrix; ``X.shape[0]`` is used as the number of samples.
        y: Target values, one per row of ``X``.
        weights: Current weight vector of the linear model ``X.dot(weights)``.

    Returns:
        ``(2 / n) * X^T (X w - y)``, where ``n`` is the number of rows of ``X``.
    """
    residuals = X.dot(weights) - y
    # The factor 2/n comes from differentiating the mean of the squared residuals
    scale = 2 / X.shape[0]
    return scale * X.T.dot(residuals)

class PolynomialRegressionGD(BaseEstimator):
    """Linear parametric model on a polynomial basis, trained with gradient descent.

    The inputs are expanded with ``PolynomialFeatures`` and a weight vector is
    fitted by repeatedly stepping against the gradient of the sample mean
    squared error.

    Args:
        learning_rate: Step size applied to the gradient on every update.
        iterations: Number of gradient descent updates performed by ``fit``.
        polynomial_degree: Degree passed to ``PolynomialFeatures``.
    """

    def __init__(self, learning_rate, iterations=1000, polynomial_degree=2):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.polynomial_degree = polynomial_degree

    def fit(self, X, y):
        """Fit the weights to ``(X, y)`` and return ``self``.

        Prints the loss before the first update and after every update. The
        loss after each update (not the initial loss) is appended to
        ``self.loss_history``, so the history has ``iterations`` entries.
        """
        # Keep the fitted transformer so predict() can apply the same expansion
        self.poly = PolynomialFeatures(degree=self.polynomial_degree)
        phi = self.poly.fit_transform(X)  # phi(X): one row of basis features per input

        # Start from the all-zero weight vector, one weight per basis feature
        self.weights = np.zeros(phi.shape[1])
        self.loss_history = []

        # Report the loss of the initial (all-zero) weights
        loss = mean_squared_error(phi.dot(self.weights), y)
        print(f"Iteration 0/{self.iterations}, Loss: {loss:.4f}")

        for i in range(1, self.iterations + 1):
            # One gradient descent step: move against the gradient of the loss
            step = self.learning_rate * compute_gradients(phi, y, self.weights)
            self.weights -= step

            # The loss is only tracked for reporting and plotting; the update
            # above does not depend on it
            loss = mean_squared_error(phi.dot(self.weights), y)
            self.loss_history.append(loss)
            print(f"Iteration {i}/{self.iterations}, Loss: {loss:.4f}")

        return self

    def predict(self, X):
        """Return the model's predictions for ``X`` using the fitted weights."""
        # Reuse the transformer fitted in fit() so the features line up with the weights
        phi = self.poly.transform(X)
        return phi.dot(self.weights)

# Read the GPA data set from disk
df = pd.read_csv("data/GPA.csv", delimiter=',')

# Every column except the last is treated as a feature; the last column is the label
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Shuffle and hold out a test set (test_size=0.4); no random_state is passed here
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    shuffle=True,
)

def run(alpha):
    """Train the degree-2 polynomial model with step size ``alpha`` and report results.

    Fits ``PolynomialRegressionGD`` on the module-level training split for 1000
    iterations, plots the training loss per iteration on a log-scaled y-axis,
    then prints the test-set MSE and the standard error of that MSE.

    Args:
        alpha: Step size (learning rate) used for gradient descent.
    """
    iterations = 1000
    polynomial_degree = 2

    # Build the model with the requested step size and train it
    regressor = PolynomialRegressionGD(
        learning_rate=alpha,
        iterations=iterations,
        polynomial_degree=polynomial_degree,
    )
    regressor.fit(X_train, y_train)

    # Training loss curve: one point per gradient descent update
    iteration_axis = range(1, iterations + 1)
    plt.plot(iteration_axis, regressor.loss_history)
    plt.xlabel('Iterations')
    plt.ylabel('Mean Squared Error')
    plt.yscale('log')
    plt.title(f'Gradient Descent Loss, Polynomial Degree: {polynomial_degree}')
    plt.show()

    # Evaluate on the held-out test split
    test_predictions = regressor.predict(X_test)
    mse_test = mean_squared_error(test_predictions, y_test)
    print(f"Test MSE: {mse_test:.4f}")

    # Standard error of the MSE: spread of the per-sample squared errors
    # divided by the square root of the number of test samples
    squared_errors = (test_predictions - y_test) ** 2
    spread = np.std(squared_errors)
    root_n = np.sqrt(len(squared_errors))
    std_error = spread / root_n
    print(f"Standard Error of MSE: {std_error:.4f}")


The `run` function takes the step size (learning rate) `alpha` as its one argument. It then runs 1,000 iterations of gradient descent on the GPA data set using the second-degree polynomial basis. Let's recreate the plot from the last lecture!

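(Note: if the loss blows up, it eventually becomes `inf` and then `nan`. These values can trigger warnings and errors, for example when the log-scale plot is drawn, so don't worry if you see some errors below.)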

In [2]:
alpha = 0.1
run(alpha)

Iteration 0/1000, Loss: 8.4113
Iteration 1/1000, Loss: 8645455371891435502567424.0000
Iteration 2/1000, Loss: 9821396341227359415106514824228002805203228164096.0000
Iteration 3/1000, Loss: 11157299069425981055503784107219233318783702966848703261111102547907575808.0000
Iteration 4/1000, Loss: 12674910797929674075228156053281642616283134365939844551412474839277991917353638059903144974876672.0000
Iteration 5/1000, Loss: 14398947517299167407550497505246322075952381197770982098586905489591436832475176787184199521480879647268266734056479129600.0000
Iteration 6/1000, Loss: 16357487079104423611885178902975050684536510448038113352447134890809632469808827483268807956746632672452405915516016678365824317342362684352364544.0000
Iteration 7/1000, Loss: 18582426473991081852760479589591014943991292799454783577765453974269211355169246226667864449020331535472153550554524194528209948852239401575401687904143601001925439389696.0000
Iteration 8/1000, Loss: 211100013095770846237279693954483984797374193342923

  self.weights -= self.learning_rate * gradients


Iteration 189/1000, Loss: nan
Iteration 190/1000, Loss: nan
Iteration 191/1000, Loss: nan
Iteration 192/1000, Loss: nan
Iteration 193/1000, Loss: nan
Iteration 194/1000, Loss: nan
Iteration 195/1000, Loss: nan
Iteration 196/1000, Loss: nan
Iteration 197/1000, Loss: nan
Iteration 198/1000, Loss: nan
Iteration 199/1000, Loss: nan
Iteration 200/1000, Loss: nan
Iteration 201/1000, Loss: nan
Iteration 202/1000, Loss: nan
Iteration 203/1000, Loss: nan
Iteration 204/1000, Loss: nan
Iteration 205/1000, Loss: nan
Iteration 206/1000, Loss: nan
Iteration 207/1000, Loss: nan
Iteration 208/1000, Loss: nan
Iteration 209/1000, Loss: nan
Iteration 210/1000, Loss: nan
Iteration 211/1000, Loss: nan
Iteration 212/1000, Loss: nan
Iteration 213/1000, Loss: nan
Iteration 214/1000, Loss: nan
Iteration 215/1000, Loss: nan
Iteration 216/1000, Loss: nan
Iteration 217/1000, Loss: nan
Iteration 218/1000, Loss: nan
Iteration 219/1000, Loss: nan
Iteration 220/1000, Loss: nan
Iteration 221/1000, Loss: nan
Iteration 

  ticklocs = b ** decades


OverflowError: cannot convert float infinity to integer

<Figure size 640x480 with 1 Axes>

Test MSE: nan
Standard Error of MSE: nan


**Question**: What went wrong, and how can we fix it?